* -----------------------------------------------------------------------------
* Setup: project root, configuration, log file and optional weight arguments
*
* Positional arguments:
*   1    project root directory (stored in global root_dir)
*   2-5  optional overrides for the globals weight_category, weight_versions,
*        weight_window and wtype; the literal ___EMPTY___ means "not supplied"
* -----------------------------------------------------------------------------
global root_dir = "`1'"

* NOTE(review): the directory globals used below (log_dir, final_dir, ...) are
* not defined in this file; presumably config.do sets them - confirm.
include "$root_dir/code/config/config.do"

* Close a log named dat that an earlier, aborted run may have left open in this
* session; otherwise the log using below fails and this run is not logged anew.
cap log close dat
* The path is quoted (like the include above) so that a directory name
* containing spaces does not split the file name.
cap noi log using "${log_dir}/sample_descriptives.log", replace name(dat)

*Handle empty arguments: map the ___EMPTY___ placeholder to an empty string
global arg1 = cond("`2'" == "___EMPTY___", "", "`2'")
global arg2 = cond("`3'" == "___EMPTY___", "", "`3'")
global arg3 = cond("`4'" == "___EMPTY___", "", "`4'")
global arg4 = cond("`5'" == "___EMPTY___", "", "`5'")

* Only overwrite a global when the corresponding argument was supplied, so that
* any value already held by that global stays in place otherwise.
if "$arg1" != "" {
    global weight_category "$arg1"
    di "Weight category: ${weight_category}"
}

if "$arg2" != "" {
    global weight_versions "$arg2"
    di "Weight versions: ${weight_versions}"
}

if "$arg3" != "" {
    global weight_window "$arg3"
    di "Weight window: ${weight_window}"
}

if "$arg4" != "" {
	global wtype "$arg4"
}
* Always shown, whether or not argument 5 was supplied
di "${wtype}"
capture noi {


* v16 Build a datafile to show statistics on patent applications, families and firms

* -----------------------------------------------------------------------------
* 1) Tag firms in sample of final reg dataset (by their restrictions)
* -----------------------------------------------------------------------------

* Import the regression data.
* NOTE(review): BvD is used as a variable abbreviation here; the keep statement
* further down refers to the same variable as BvDIDnumber.
use BvD year lse_id missing_weights_1995 maxweight_1995 missing_spill_weights_1995 auto95_bia using ${final_dir}/regression_dataset_from1970_tfacit1.dta, clear

* Tag if firm has weights in 1995, is not domestic, has spillover weights, and
* does auto95 innovations in 1997 to 2011. A logical expression evaluates to
* 0/1 and never to missing, so each flag is a clean dummy.
gen f_has_weights = (missing_weights_1995 == 0)
* a missing maxweight_1995 is not < 1, so such firms get 0
gen f_multinat = (maxweight_1995 < 1)
gen f_has_spillweights = (missing_spill_weights_1995 == 0)

* Sum auto95_bia over 1997-2011 within firm, then spread that total to all of
* the firm's rows (the first egen leaves rows outside the window missing).
bys lse_id : egen _total_auto95_bia_1995 = sum(auto95_bia) if year>=1997 & year <= 2011
bys lse_id : egen total_auto95_bia_1995 = max(_total_auto95_bia_1995)
* A firm without any row in 1997-2011 has a missing total. Missing compares as
* larger than every number, so it must be excluded explicitly; otherwise such
* a firm would be flagged as innovating.
gen f_does_auto95_bia = (total_auto95_bia_1995 > 0 & total_auto95_bia_1995 < .)

* Mark firms in baseline regression (singletons, e.g., in terms of
* industry-year, are dropped (or other multicollinearity issues))
mmerge BvD using ${final_dir}/bvd_list_regfirms_auto95.dta, unmatched(master)
gen f_in_reg = (_merge == 3)

* Combined indicator: all restrictions hold and the firm is on the list above
gen f_in_sample = (f_has_weights == 1 & f_multinat == 1 & f_does_auto95_bia == 1 & f_in_reg == 1)

* Reduce to one row per firm; force keeps the first row of each BvDIDnumber
keep BvDIDnumber f_has_weights f_multinat f_has_spillweights f_does_auto95_bia f_in_reg f_in_sample lse_id
duplicates drop BvDIDnumber, force
count if f_in_sample == 1

*labeling
label variable f_has_weights "Firm has weights in 1995"
label variable f_multinat "Firm is multinational"
label variable f_has_spillweights "Firm has spillover weights"
label variable f_does_auto95_bia "Firm does auto95 innovations"
label variable f_in_reg "Firm is in baseline regression (but may be dropped)"
label variable f_in_sample "Firm is not dropped in baseline regression"

* Kept as a temporary file; merged onto the patent data in step 3
tempfile firms_in_regdataset
save `firms_in_regdataset', replace

* -----------------------------------------------------------------------------
* 2) Combine machinery patents with classification and with patent information
* -----------------------------------------------------------------------------

* Application level

* Stack the auto95 and pauto95 application lists. Rows coming from the
* appended pauto95 file have auto95 missing at that point, which is what
* identifies them.
use ${dataset_dir}/patent_list/auto95_patents.dta, clear
gen auto95 = 1
append using ${dataset_dir}/patent_list/pauto95_patents.dta
gen pauto95 = (auto95 == .)
replace auto95 = 0 if auto95 == .


* Flag membership in the refined placebo lists and in auto90. The assert
* guards against applications that appear only in the using file: by the
* logic of auto95/pauto95 there should be none.
foreach plist in pauto90 pauto90_rm6 auto90 {
    mmerge appln_id using ${dataset_dir}/patent_list/`plist'_patents.dta
    gen `plist' = (_merge == 3)
    assert _merge != 2
    drop _merge
}

* Keep only the correct ipr type ("PI", 2). Note: 17.26 percent are "UM" and 0.01 percent "DP"
mmerge appln_id using ${commondata_dir}/patstat_2018b/appln_info.dta, unmatched(master) ukeep(ipr_type)
keep if ipr_type == 2
drop _merge

* Every remaining application must be found in the machinery list
mmerge appln_id using ${dataset_dir}/patent_list/pats_tfa.dta, unmatched(master)
assert _merge == 3
drop _merge


* Family level

* Family identifier and earliest application year of the family
mmerge appln_id using ${commondata_dir}/patstat_2018b/family_info.dta, unmatched(master) ukeep(docdb_family_id fam_earliest_appln_year)

* Citation counts per family (excluding self-citations); the dummy marks
* families cited at least once within 5 years
mmerge docdb_family_id using ${commondata_dir}/patstat_2018b/citations_by_docdb_id_exclself.dta, unmatched(master) ukeep(cit_alltime cit_5yrs)
gen cit_5yrs_atleast = (cit_5yrs > 0) & !missing(cit_5yrs)

* Citations received within 5 years, normalized by technological field.
* (Note: those with year "9999", i.e. missing, are not matched)
* The weight becomes 1 + normalized citations, and 1 when there is no value.
mmerge docdb_family_id using ${dataset_dir}/patstat_orbis/citations_by_docdb_id_exclself_normalized.dta, unmatched(master) ukeep(cit_5yrs_norm)
replace cit_5yrs_norm = cit_5yrs_norm + 1
replace cit_5yrs_norm = 1 if missing(cit_5yrs_norm)
gen cit_merged = (_merge == 3)

* Biadic indicator per family, stored under the shorter name bia
mmerge docdb_family_id using ${dataset_dir}/patstat_orbis/docdb_families2.dta, unmatched(master) ukeep(biadic_D)
rename biadic_D bia

* -----------------------------------------------------------------------------
* 3) Merge with orbis and firm info from step 1)
* -----------------------------------------------------------------------------

* Separate log (named num) that records only the match statistics below.
* A log named num left open by an earlier run that failed before reaching the
* matching close would make log using abort this run, so close it first.
cap log close num
log using ${numb_dir}/sample_descriptives_numbers.log, replace name(num)
* Merge with orbis
* (has 164 dupl obs in terms of appln_id-BvD => should be ok by only keeping BvD)
mmerge appln_id using ${dataset_dir}/patstat_orbis/Orbis_patents_list_2017_merged.dta, unmatched(master) ukeep(BvD)
gen in_orbis=(_m==3)
tab in_orbis
duplicates report appln_id
duplicates report

cap log close num

* Merge with firm information, count appl/family as belonging to sample if one
* firm that belongs in sample is assoc. with it. The sums are only defined on
* rows with an Orbis match and are missing elsewhere.
mmerge BvDIDnumber using `firms_in_regdataset', unmatched(master)
drop _m
bys docdb_family_id: egen f_in_sample_per_fam = sum(f_in_sample) if in_orbis == 1
bys appln_id: egen f_in_sample_per_appln = sum(f_in_sample) if in_orbis == 1

* Dummy for families whose earliest application year lies in 1997-2011.
* A missing year is not <= 2011, so it yields 0.
gen in_period = (fam_earliest_appln_year >= 1997 & fam_earliest_appln_year <= 2011)
* Sample membership needs at least one in-sample firm and the period dummy;
* both indicators are missing for rows without an Orbis match.
gen fam_in_sample = (f_in_sample_per_fam >= 1 & in_period == 1) if in_orbis == 1
gen appln_in_sample = (f_in_sample_per_appln >= 1 & in_period == 1) if in_orbis == 1

* -----------------------------------------------------------------------------
* 4) compute unique's on appln, family and firm level

* -----------------------------------------------------------------------------
* Keep the relevant variables, mark unique appln and docdb's
* (Note that multiple firms can apply for the same appln_id
* (e.g. two swiss firms applied for appln_id 644 in 2008). I keep them for now but tag the first id in each group (i.e. get unique ones))

* application level:
by appln_id, sort: gen uniq_appln = _n == 1
* Create the id copy with the storage type of appln_id. A plain gen would
* default to float, which cannot hold integers above 16,777,216 exactly and
* would therefore silently change large ids.
local appln_id_type : type appln_id
gen `appln_id_type' uniq_appln_id = appln_id if uniq_appln == 1
by appln_id BvD, sort: gen uniq_firms_appln_comb = _n == 1 if in_orbis == 1
bys appln_id: egen firms_per_appln = sum(uniq_firms_appln_comb)

* family level:
by docdb_family_id, sort: gen uniq_fam = _n == 1
* same storage-type precaution as for uniq_appln_id
local family_id_type : type docdb_family_id
gen `family_id_type' uniq_docdb_family_id = docdb_family_id if uniq_fam == 1
by docdb_family_id BvD, sort: gen uniq_firms_fam_comb = _n == 1 if in_orbis == 1
bys docdb_family_id: egen firms_per_fam = sum(uniq_firms_fam_comb) if in_orbis == 1
by docdb_family_id appln_id, sort: gen uniq_appln_fam_comb = _n == 1
bys docdb_family_id: egen appln_per_fam = sum(uniq_appln_fam_comb)

** firm level:
* NOTE(review): unlike the counts below, uniq_firm is not restricted to
* in_orbis == 1, so the first row of the group with an empty BvD is tagged as
* well - confirm that downstream firm counts condition on in_orbis.
bys BvD: gen uniq_firm = _n == 1
bys BvD: egen fams_per_firm = sum(uniq_firms_fam_comb) if in_orbis == 1
bys BvD: egen appln_per_firm = sum(uniq_firms_appln_comb) if in_orbis == 1

* clean up and save
sort appln_id BvD

*labeling
* (no labels are assigned to the variables created in this section)

save ${final_dir}/sample_descriptives.dta, replace


}
* _rc is the return code of the capture block above; no command may be
* placed between the closing brace and this check.
if _rc == 0 {
    display "Execution finished successfully."
}
else {
    display "Execution finished with errors."
}

* The num log stays open when the captured block failed between opening and
* closing it; close it here (after the _rc check) so it is not left open.
cap log close num
cap log close dat